To correct for batch effects, we will use the Harmony package, since it was reported to be the fastest and most accurate method in a recent benchmarking effort. To this end, we will follow the “Integration of datasets using Harmony” vignette of the SeuratWrappers package.
Importantly, we will use the harmony coordinates (batch-corrected principal components) to compute a k-nearest neighbors (KNN) graph, which will be used to compute the proportion of doublet nearest neighbors (pDNN) for each cell (see the following notebook).
library(Seurat)
library(SeuratWrappers)
library(harmony)
library(tidyverse)
# Paths
path_to_obj <- here::here("scRNA-seq/results/R_objects/seurat_merged_all.rds")
path_to_save_obj <- here::here("scRNA-seq/results/R_objects/seurat_integrated_with_doublets.rds")
path_tmp_dir <- here::here("scRNA-seq/2-QC/5-batch_effect_correction/1-doublet_exclusion/tmp/")
path_to_save_knn <- str_c(path_tmp_dir, "integrated_knn_graph.rds", sep = "")
path_to_save_doubl_annot <- str_c(path_tmp_dir, "doublet_preliminary_annotations.rds", sep = "")
# Set k for the KNN graph
k <- 75
tonsil <- readRDS(path_to_obj)
tonsil
## An object of class Seurat
## 28504 features across 347262 samples within 1 assay
## Active assay: RNA (28504 features, 0 variable features)
To normalize by sequencing depth, we will divide each count by the library size of the cell (total number of UMI) and log-transform it, similarly to other high-quality atlases, like the thymus and the heart atlas.
tonsil <- NormalizeData(
tonsil,
normalization.method = "LogNormalize",
scale.factor = 1e4
)
tonsil[["RNA"]]@data[1:10, 1:10]
## 10 x 10 sparse Matrix of class "dgCMatrix"
##
## AL627309.1 . . . . . . . . . .
## AL627309.3 . . . . . . . . . .
## AL627309.5 . . . . . . . . . .
## AL627309.4 . . . . . . . . . .
## AP006222.2 . . . . . . . . . .
## AL669831.2 . . . . . . . . . .
## LINC01409 . . . . . . . 1.119716 . 0.2179193
## FAM87B . . . . . . . . . .
## LINC01128 . . . . . . . . . .
## LINC00115 . . . . . . . . . .
tonsil <- tonsil %>%
FindVariableFeatures(nfeatures = 3000) %>%
ScaleData() %>%
RunPCA() %>%
RunUMAP(reduction = "pca", dims = 1:30)
p1 <- DimPlot(tonsil, group.by = "library_name", pt.size = 0.1) + NoLegend()
p2 <- DimPlot(tonsil, group.by = "sex", pt.size = 0.1)
p3 <- DimPlot(tonsil, group.by = "age_group", pt.size = 0.1)
p4 <- DimPlot(tonsil, group.by = "is_hashed", pt.size = 0.1)
print("UMAP colored by GEM:")
## [1] "UMAP colored by GEM:"
p1
print("UMAP colored by sex:")
## [1] "UMAP colored by sex:"
p2
print("UMAP colored by age:")
## [1] "UMAP colored by age:"
p3
print("UMAP colored by whether it's hashed:")
## [1] "UMAP colored by whether it's hashed:"
p4
tonsil <- tonsil %>%
RunHarmony(reduction = "pca", dims = 1:30, group.by.vars = "gem_id") %>%
RunUMAP(reduction = "harmony", dims = 1:30)
p1_corr <- DimPlot(tonsil, group.by = "library_name", pt.size = 0.1) +
NoLegend()
p2_corr <- DimPlot(tonsil, group.by = "sex", pt.size = 0.1)
p3_corr <- DimPlot(tonsil, group.by = "age_group", pt.size = 0.1)
p4_corr <- DimPlot(tonsil, group.by = "is_hashed", pt.size = 0.1)
print("UMAP colored by GEM:")
## [1] "UMAP colored by GEM:"
p1_corr
print("UMAP colored by sex:")
## [1] "UMAP colored by sex:"
p2_corr
print("UMAP colored by age:")
## [1] "UMAP colored by age:"
p3_corr
print("UMAP colored by whether it's hashed:")
## [1] "UMAP colored by whether it's hashed:"
p4_corr
tonsil <- FindNeighbors(
tonsil,
reduction = "harmony",
dims = 1:30,
k.param = k,
compute.SNN = FALSE
)
tonsil
## An object of class Seurat
## 28504 features across 347262 samples within 1 assay
## Active assay: RNA (28504 features, 3000 variable features)
## 3 dimensional reductions calculated: pca, umap, harmony
# If it doesn't exist create temporal directory
dir.create(path_tmp_dir, showWarnings = FALSE)
# Save
selected_cols <- c("HTO_classification.global", "has_high_lib_size",
"scrublet_predicted_doublet", "scrublet_doublet_scores",
"scrublet_doublet_scores_scaled")
doublet_annot_df <- tonsil@meta.data[, selected_cols]
knn_graph <- as(tonsil@graphs$RNA_nn, "sparseMatrix")
saveRDS(knn_graph, path_to_save_knn)
saveRDS(tonsil, path_to_save_obj)
#saveRDS(tonsil@graphs$RNA_nn, path_to_save_knn)
saveRDS(doublet_annot_df, path_to_save_doubl_annot)
sessionInfo()
## R version 3.6.0 (2019-04-26)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Red Hat Enterprise Linux Server release 6.7 (Santiago)
##
## Matrix products: default
## BLAS: /apps/R/3.6.0/lib64/R/lib/libRblas.so
## LAPACK: /home/devel/rmassoni/anaconda3/lib/libmkl_rt.so
##
## locale:
## [1] LC_CTYPE=C LC_NUMERIC=C LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8 LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8 LC_PAPER=en_US.UTF-8 LC_NAME=C LC_ADDRESS=C LC_TELEPHONE=C LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] forcats_0.5.0 stringr_1.4.0 dplyr_1.0.4 purrr_0.3.4 readr_1.3.1 tidyr_1.1.0 tibble_3.0.1 ggplot2_3.3.0 tidyverse_1.3.0 harmony_1.0 Rcpp_1.0.6 SeuratWrappers_0.2.0 Seurat_3.2.0 BiocStyle_2.14.4
##
## loaded via a namespace (and not attached):
## [1] Rtsne_0.15 colorspace_1.4-1 deldir_0.1-25 ellipsis_0.3.1 ggridges_0.5.2 rprojroot_1.3-2 fs_1.4.1 rstudioapi_0.11 spatstat.data_1.4-3 farver_2.0.3 leiden_0.3.3 listenv_0.8.0 remotes_2.2.0 ggrepel_0.8.2 RSpectra_0.16-0 fansi_0.4.1 lubridate_1.7.8 xml2_1.3.2 codetools_0.2-16 splines_3.6.0 knitr_1.28 polyclip_1.10-0 jsonlite_1.7.2 broom_0.5.6 ica_1.0-2 cluster_2.1.0 dbplyr_1.4.4 png_0.1-7 uwot_0.1.8 shiny_1.4.0.2 sctransform_0.2.1 BiocManager_1.30.10 compiler_3.6.0 httr_1.4.2 backports_1.1.7 assertthat_0.2.1 Matrix_1.2-18 fastmap_1.0.1 lazyeval_0.2.2 cli_2.0.2 later_1.0.0 htmltools_0.4.0 tools_3.6.0 rsvd_1.0.3 igraph_1.2.5 gtable_0.3.0 glue_1.4.1 RANN_2.6.1 reshape2_1.4.4 rappdirs_0.3.1 spatstat_1.64-1 cellranger_1.1.0 vctrs_0.3.6 ape_5.3
## [55] nlme_3.1-148 lmtest_0.9-37 xfun_0.14 globals_0.12.5 rvest_0.3.5 mime_0.9 miniUI_0.1.1.1 lifecycle_0.2.0 irlba_2.3.3 goftest_1.2-2 future_1.17.0 MASS_7.3-51.6 zoo_1.8-8 scales_1.1.1 hms_0.5.3 promises_1.1.0 spatstat.utils_1.17-0 parallel_3.6.0 RColorBrewer_1.1-2 yaml_2.2.1 reticulate_1.16 pbapply_1.4-2 gridExtra_2.3 rpart_4.1-15 stringi_1.4.6 rlang_0.4.10 pkgconfig_2.0.3 evaluate_0.14 lattice_0.20-41 ROCR_1.0-11 tensor_1.5 labeling_0.3 patchwork_1.0.0 htmlwidgets_1.5.1 cowplot_1.0.0 tidyselect_1.1.0 here_0.1 RcppAnnoy_0.0.16 plyr_1.8.6 magrittr_1.5 bookdown_0.19 R6_2.4.1 generics_0.0.2 DBI_1.1.0 withr_2.4.1 pillar_1.4.4 haven_2.3.1 mgcv_1.8-31 fitdistrplus_1.1-1 survival_3.1-12 abind_1.4-5 future.apply_1.5.0 modelr_0.1.8 crayon_1.3.4
## [109] KernSmooth_2.23-17 plotly_4.9.2.1 rmarkdown_2.2 readxl_1.3.1 grid_3.6.0 data.table_1.12.8 blob_1.2.1 reprex_0.3.0 digest_0.6.20 xtable_1.8-4 httpuv_1.5.3.1 munsell_0.5.0 viridisLite_0.3.0